{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f12103c3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-04T12:01:27.548037Z",
     "iopub.status.busy": "2024-10-04T12:01:27.547076Z",
     "iopub.status.idle": "2024-10-04T12:01:27.580115Z",
     "shell.execute_reply": "2024-10-04T12:01:27.576106Z",
     "shell.execute_reply.started": "2024-10-04T12:01:27.548037Z"
    }
   },
   "source": [
    "This notebook takes raw data pulled from the Facebook marketing API. It performs a number of cleaning steps including:\n",
    "1. Dropping unnecessary or invariant columns.\n",
    "2. Averaging upper and lower bounds of MAU to get a single MAU estimate (for backward compatibility with data pulled from previous versions of the API)\n",
    "3. Standardize the coding and labeling of age ranges, genders, and education levels.\n",
    "4. Removing any strata that may be redundant (for example, a strata that covers \"all\" genders if we have separate \"male\" and \"female\" genders; otherwise people could be double counted in our audience estimates.\n",
    "5. Ensuring that all expected strata (and no duplicate strata) are present.\n",
    "6. Calculating population proportions across different demographics from the raw counts of users.\n",
    "This creates a uniform final data frame that is used for the graphics in the paper."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b28bdb63",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.171022Z",
     "iopub.status.busy": "2024-11-21T00:32:51.171022Z",
     "iopub.status.idle": "2024-11-21T00:32:51.642026Z",
     "shell.execute_reply": "2024-11-21T00:32:51.642026Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.171022Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.1.0\n",
      "1.19.2\n",
      "3.6.10 (default, Mar  5 2020, 10:17:47) [MSC v.1900 64 bit (AMD64)]\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import sys\n",
    "\n",
    "print(pd.__version__)\n",
    "print(np.__version__)\n",
    "print(sys.version) # Python version"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94c07afc",
   "metadata": {},
   "source": [
    "# Import Facebook data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d19a8606",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.645019Z",
     "iopub.status.busy": "2024-11-21T00:32:51.645019Z",
     "iopub.status.idle": "2024-11-21T00:32:51.720699Z",
     "shell.execute_reply": "2024-11-21T00:32:51.720639Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.645019Z"
    }
   },
   "outputs": [],
   "source": [
    "mex = pd.read_csv(\"mexico replication/mexico data/mexico_facebook_population.csv\", index_col=\"Unnamed: 0\")\n",
    "ind = pd.read_csv(\"indonesia replication/indonesia data/facebook_population_indonesia/indonesia_fb_with_edu.csv\")\n",
    "ken = pd.read_csv(\"kenya replication/data sets/facebook_population_kenya/dataframe_collected_finished_1591007152.csv\", index_col = \"Unnamed: 0\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "186d2318",
   "metadata": {},
   "source": [
    "# Clean Facebook data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "989c332d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.720699Z",
     "iopub.status.busy": "2024-11-21T00:32:51.720699Z",
     "iopub.status.idle": "2024-11-21T00:32:51.769387Z",
     "shell.execute_reply": "2024-11-21T00:32:51.769387Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.720699Z"
    }
   },
   "outputs": [],
   "source": [
    "for df in [ken, mex, ind]:\n",
    "    \n",
    "    # Remove empty columns\n",
    "    df.dropna(how='all', axis=1, inplace=True)\n",
    "    \n",
    "    # Remove invariant columns\n",
    "    for var in df.columns:\n",
    "        if len(df[var].unique()) == 1:\n",
    "            df.drop(labels = [var], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c8d34aba",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.769387Z",
     "iopub.status.busy": "2024-11-21T00:32:51.769387Z",
     "iopub.status.idle": "2024-11-21T00:32:51.786788Z",
     "shell.execute_reply": "2024-11-21T00:32:51.785824Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.769387Z"
    }
   },
   "outputs": [],
   "source": [
    "# Remove unnecessary variables \n",
    "mex.drop(columns=[\"dau_audience\", \"timestamp\"], inplace=True)\n",
    "ken.drop(columns = ['all_fields', 'targeting', 'response', 'dau_audience'], inplace=True)\n",
    "ind.drop(columns = ['all_fields', 'targeting', 'response', 'dau_audience'], inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5739bd76",
   "metadata": {},
   "source": [
    "## Recode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4d427bfd",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.788414Z",
     "iopub.status.busy": "2024-11-21T00:32:51.788414Z",
     "iopub.status.idle": "2024-11-21T00:32:51.803474Z",
     "shell.execute_reply": "2024-11-21T00:32:51.801851Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.788414Z"
    }
   },
   "outputs": [],
   "source": [
    "#  For Indonesia, calculate monthly active users (MAU) as the middle between the lower and upper bounds\n",
    "# This is a change in how the Facebook Marketing API reports MAU, it now gives a range instead of a point estimate\n",
    "ind['mau_audience'] = (ind['mau_audience_upper_bound'] + ind['mau_audience_lower_bound'])/2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca57491a",
   "metadata": {},
   "source": [
    "### Age ranges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ca46100c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.803474Z",
     "iopub.status.busy": "2024-11-21T00:32:51.803474Z",
     "iopub.status.idle": "2024-11-21T00:32:51.837114Z",
     "shell.execute_reply": "2024-11-21T00:32:51.836082Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.803474Z"
    }
   },
   "outputs": [],
   "source": [
    "# Create a single age range variable for Mexico and standardize coding\n",
    "mex['age_range'] = mex['age_range_min'].astype(int).astype(str) + \"_\" + mex['age_range_max'].astype(str)\n",
    "\n",
    "mex['age_range'] = mex['age_range'].replace({'18_29.0': \"18-29\", \n",
    "                          '30_49.0': \"30-49\", \n",
    "                          '50_59.0': \"50-59\", \n",
    "                          '60_nan': \"60+\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b62fcadb",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.837114Z",
     "iopub.status.busy": "2024-11-21T00:32:51.837114Z",
     "iopub.status.idle": "2024-11-21T00:32:51.859537Z",
     "shell.execute_reply": "2024-11-21T00:32:51.859537Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.837114Z"
    }
   },
   "outputs": [],
   "source": [
    "# Standardize age range coding for Kenya\n",
    "ken['ages_ranges'] = ken['ages_ranges'].replace({\"{u'min': 18}\": \"18+\", \n",
    "                         \"{u'max': 29, u'min': 18}\": \"18-29\",\n",
    "                         \"{u'max': 49, u'min': 30}\": \"30-49\", \n",
    "                         \"{u'max': 59, u'min': 50}\": \"50-59\",\n",
    "                         \"{u'min': 60}\": \"60+\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4207d232",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.861627Z",
     "iopub.status.busy": "2024-11-21T00:32:51.861627Z",
     "iopub.status.idle": "2024-11-21T00:32:51.868814Z",
     "shell.execute_reply": "2024-11-21T00:32:51.868203Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.861627Z"
    }
   },
   "outputs": [],
   "source": [
    "# Standardize age range coding for Indonesia\n",
    "ind['ages_ranges'] = ind['ages_ranges'].replace({\n",
    "                         \"{'min': 18, 'max': 29}\": \"18-29\",\n",
    "                         \"{'min': 30, 'max': 49}\": \"30-49\", \n",
    "                         \"{'min': 50, 'max': 59}\": \"50-59\",\n",
    "                         \"{'min': 60}\": \"60+\"})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4666bb8",
   "metadata": {},
   "source": [
    "### Gender"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ee91c350",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.870075Z",
     "iopub.status.busy": "2024-11-21T00:32:51.870075Z",
     "iopub.status.idle": "2024-11-21T00:32:51.894855Z",
     "shell.execute_reply": "2024-11-21T00:32:51.894855Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.870075Z"
    }
   },
   "outputs": [],
   "source": [
    "# Standardize gender coding\n",
    "genders = {1:'Male', 2:'Female', 0:'All'}\n",
    "ken['genders'] = ken.genders.replace(genders)\n",
    "ind['genders'] = ind.genders.replace(genders)\n",
    "\n",
    "mex['genders']= mex['genders'].str.title()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bca06c7d",
   "metadata": {},
   "source": [
    "## Handle duplicates"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd52cb9c",
   "metadata": {},
   "source": [
    "### Drop duplicate variables that might be problematic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "528e6cb6",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.894855Z",
     "iopub.status.busy": "2024-11-21T00:32:51.894855Z",
     "iopub.status.idle": "2024-11-21T00:32:51.920173Z",
     "shell.execute_reply": "2024-11-21T00:32:51.918831Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.894855Z"
    }
   },
   "outputs": [],
   "source": [
    "# Remove rows with counts across genders\n",
    "ind = ind[ind['genders']!=\"All\"].copy()\n",
    "mex = mex[mex['genders']!=\"All\"].copy()\n",
    "ken= ken[ken['genders']!=\"All\"].copy()\n",
    "\n",
    "# Remove rows with counts across age ranges\n",
    "ken = ken[ken['ages_ranges']!=\"18+\"].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a308d7b0",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.920173Z",
     "iopub.status.busy": "2024-11-21T00:32:51.920173Z",
     "iopub.status.idle": "2024-11-21T00:32:51.935339Z",
     "shell.execute_reply": "2024-11-21T00:32:51.935339Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.920173Z"
    }
   },
   "outputs": [],
   "source": [
    "# Remove rows with no scholarity recorded\n",
    "ken=ken[~ken.scholarities.isna()]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5a65ed67",
   "metadata": {},
   "source": [
    "### (Optional quality check:) Ensure no duplicates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "2d6681dc",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.935339Z",
     "iopub.status.busy": "2024-11-21T00:32:51.935339Z",
     "iopub.status.idle": "2024-11-21T00:32:51.951127Z",
     "shell.execute_reply": "2024-11-21T00:32:51.951127Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.935339Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "genders: ['Female', 'Male']\n",
      "age_range_min: [18, 30, 50, 60]\n",
      "age_range_max: [29.0, 49.0, 59.0, nan]\n",
      "education_level: ['Associate degree', 'College grad', 'Doctorate degree', 'High school grad', 'In college', 'In grad school', 'In high school', 'Masters degree', 'Professional degree', 'Some college', 'Some grad school', 'Some high school', 'Unspecified']\n",
      "region: [2505, 2506, 2507, 2508, 2509, 2510, 2511, 2512, 2513, 2514, 2515, 2516, 2517, 2518, 2519, 2520, 2521, 2522, 2523, 2524, 2525, 2526, 2527, 2528, 2529, 2530, 2531, 2532, 2533, 2534, 2535, 2536]\n",
      "region_name: ['Aguascalientes', 'Baja California', 'Baja California Sur', 'Campeche', 'Chiapas', 'Chihuahua', 'Coahuila de Zaragoza', 'Colima', 'Distrito Federal', 'Durango', 'Guanajuato', 'Guerrero', 'Hidalgo', 'Jalisco', 'Michoacán de Ocampo', 'Morelos', 'México', 'Nayarit', 'Nuevo Leon', 'Oaxaca', 'Puebla', 'Querétaro Arteaga', 'Quintana Roo', 'San Luis Potosí', 'Sinaloa', 'Sonora', 'Tabasco', 'Tamaulipas', 'Tlaxcala', 'Veracruz', 'Yucatán', 'Zacatecas']\n",
      "age_range: ['18-29', '30-49', '50-59', '60+']\n"
     ]
    }
   ],
   "source": [
    "for var in mex:\n",
    "    if var not in [\"dau_audience\", \"mau_audience\", \"timestamp\"]:\n",
    "        print(f\"{var}: {sorted(mex[var].unique())}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "3390d9e7",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.953359Z",
     "iopub.status.busy": "2024-11-21T00:32:51.953359Z",
     "iopub.status.idle": "2024-11-21T00:32:51.961906Z",
     "shell.execute_reply": "2024-11-21T00:32:51.961906Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.953359Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ages_ranges: ['18-29', '30-49', '50-59', '60+']\n",
      "genders: ['Female', 'Male']\n",
      "scholarities: [\"{u'name': u'Graduated', u'or': [3, 8, 9, 11]}\", \"{u'name': u'High School', u'or': [4, 5, 6, 10]}\", \"{u'name': u'In College', u'or': [2]}\", \"{u'name': u'In Grad School', u'or': [7]}\", \"{u'name': u'In High School', u'or': [1]}\", \"{u'name': u'No Degree', u'or': [13]}\", \"{u'name': u'Unspecified', u'or': [12]}\"]\n"
     ]
    }
   ],
   "source": [
    "for var in ken:\n",
    "    if var not in [ \"mau_audience\"]:\n",
    "        print(f\"{var}: {sorted(ken[var].astype(str).unique())}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "16002089",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.965360Z",
     "iopub.status.busy": "2024-11-21T00:32:51.964992Z",
     "iopub.status.idle": "2024-11-21T00:32:51.994569Z",
     "shell.execute_reply": "2024-11-21T00:32:51.994569Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.965360Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ages_ranges: ['18-29', '30-49', '50-59', '60+']\n",
      "genders: ['Female', 'Male']\n",
      "scholarities: [\"{'or': [10], 'name': 'Professional degree'}\", \"{'or': [11], 'name': 'Doctorate degree'}\", \"{'or': [12], 'name': 'Unspecified'}\", \"{'or': [13], 'name': 'Some high school'}\", \"{'or': [1], 'name': 'In high school'}\", \"{'or': [2], 'name': 'In college'}\", \"{'or': [3], 'name': 'College grad'}\", \"{'or': [4], 'name': 'High school grad'}\", \"{'or': [5], 'name': 'Some college'}\", \"{'or': [6], 'name': 'Associate degree'}\", \"{'or': [7], 'name': 'In grad school'}\", \"{'or': [8], 'name': 'Some grad school'}\", \"{'or': [9], 'name': 'Masters degree'}\"]\n"
     ]
    }
   ],
   "source": [
    "for var in ind:\n",
    "    if var not in [ \"mau_audience\", 'mau_audience_upper_bound', 'mau_audience_lower_bound']:\n",
    "        print(f\"{var}: {sorted(ind[var].astype(str).unique())}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "8799f3e9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:51.994569Z",
     "iopub.status.busy": "2024-11-21T00:32:51.994569Z",
     "iopub.status.idle": "2024-11-21T00:32:52.020179Z",
     "shell.execute_reply": "2024-11-21T00:32:52.018601Z",
     "shell.execute_reply.started": "2024-11-21T00:32:51.994569Z"
    }
   },
   "outputs": [],
   "source": [
    "# Confirm again that there are no duplicate strata in the dataset\n",
    "assert(mex[['genders', 'age_range', 'education_level', 'region']].duplicated().max() == False)\n",
    "assert(ken[['genders', 'ages_ranges', 'scholarities']].duplicated().max() == False)\n",
    "assert(ind[['genders', 'ages_ranges', 'scholarities']].duplicated().max() == False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "dbe1fc7e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:52.022028Z",
     "iopub.status.busy": "2024-11-21T00:32:52.022028Z",
     "iopub.status.idle": "2024-11-21T00:32:52.042890Z",
     "shell.execute_reply": "2024-11-21T00:32:52.042890Z",
     "shell.execute_reply.started": "2024-11-21T00:32:52.022028Z"
    }
   },
   "outputs": [],
   "source": [
    "# Confirm that all strata are included in the dataset\n",
    "assert (ken.shape[0] == len(ken.genders.unique()) * len(ken.ages_ranges.unique()) * len(ken.scholarities.unique()))\n",
    "assert (mex.shape[0] == len(mex.genders.unique()) * len(mex.age_range.unique()) * len(mex.region_name.unique()) * len(mex.education_level.unique()))\n",
    "assert (ind.shape[0] == len(ind.genders.unique()) * len(ind.ages_ranges.unique()) * len(ind.scholarities.unique()))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "746f5fa3",
   "metadata": {},
   "source": [
    "# Regroup education levels together"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "a4146ca8",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:52.042890Z",
     "iopub.status.busy": "2024-11-21T00:32:52.042890Z",
     "iopub.status.idle": "2024-11-21T00:32:52.067556Z",
     "shell.execute_reply": "2024-11-21T00:32:52.067556Z",
     "shell.execute_reply.started": "2024-11-21T00:32:52.042890Z"
    }
   },
   "outputs": [],
   "source": [
    "mex['education_level'] = mex['education_level'].replace({\n",
    "                                'In high school'     : \"Did not complete secondary school\", \n",
    "                                'Some high school'   : \"Did not complete secondary school\",\n",
    "                                'High school grad'   : \"Secondary degree\", \n",
    "                                'Associate degree'   : \"Secondary degree\",\n",
    "                                'Professional degree': \"Secondary degree\",\n",
    "                                'In college'         : \"Secondary degree\", \n",
    "                                'Some college'       : \"Secondary degree\", \n",
    "                                'College grad'       : \"College degree or higher\", \n",
    "                                'In grad school'     : \"College degree or higher\", \n",
    "                                'Some grad school'   : \"College degree or higher\", \n",
    "                                'Masters degree'     : \"College degree or higher\", \n",
    "                                'Doctorate degree'   : \"College degree or highery\",  \n",
    "                                'Unspecified'        : \"Unspecified education\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "f9e3c8cf",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:52.067556Z",
     "iopub.status.busy": "2024-11-21T00:32:52.067556Z",
     "iopub.status.idle": "2024-11-21T00:32:52.084050Z",
     "shell.execute_reply": "2024-11-21T00:32:52.084050Z",
     "shell.execute_reply.started": "2024-11-21T00:32:52.067556Z"
    }
   },
   "outputs": [],
   "source": [
    "ken['scholarities'] = ken['scholarities'].replace( {\n",
    "                          \"{u'name': u'No Degree', u'or': [13]}\"           :\"Did not complete secondary school\", \n",
    "                          \"{u'name': u'In High School', u'or': [1]}\"       :\"Did not complete secondary school\",\n",
    "                          \"{u'name': u'High School', u'or': [4, 5, 6, 10]}\":\"Secondary degree\",\n",
    "                          \"{u'name': u'In College', u'or': [2]}\"           :\"Secondary degree\",\n",
    "                          \"{u'name': u'In Grad School', u'or': [7]}\"       :\"College degree or higher\",\n",
    "                          \"{u'name': u'Graduated', u'or': [3, 8, 9, 11]}\"  :\"College degree or higher\", \n",
    "                          \"{u'name': u'Unspecified', u'or': [12]}\"         :\"Unspecified education\", })\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "98098275",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:52.084050Z",
     "iopub.status.busy": "2024-11-21T00:32:52.084050Z",
     "iopub.status.idle": "2024-11-21T00:32:52.103755Z",
     "shell.execute_reply": "2024-11-21T00:32:52.103755Z",
     "shell.execute_reply.started": "2024-11-21T00:32:52.084050Z"
    }
   },
   "outputs": [],
   "source": [
    "ind['scholarities'] = ind['scholarities'].replace( {\n",
    "       \"{'or': [13], 'name': 'Some high school'}\"    :\"Did not complete secondary school\", \n",
    "     \"{'or': [1], 'name': 'In high school'}\"         :\"Did not complete secondary school\", \n",
    "       \"{'or': [2], 'name': 'In college'}\"           :\"Secondary degree\",\n",
    "       \"{'or': [4], 'name': 'High school grad'}\"     :\"Secondary degree\",\n",
    "       \"{'or': [5], 'name': 'Some college'}\"         :\"Secondary degree\",\n",
    "       \"{'or': [6], 'name': 'Associate degree'}\"     :\"Secondary degree\",\n",
    "       \"{'or': [10], 'name': 'Professional degree'}\" :\"Secondary degree\",\n",
    "       \"{'or': [3], 'name': 'College grad'}\"         :\"College degree or higher\",\n",
    "       \"{'or': [7], 'name': 'In grad school'}\"       :\"College degree or higher\",\n",
    "       \"{'or': [8], 'name': 'Some grad school'}\"     :\"College degree or higher\",\n",
    "       \"{'or': [9], 'name': 'Masters degree'}\"       :\"College degree or higher\",\n",
    "       \"{'or': [11], 'name': 'Doctorate degree'}\"    :\"College degree or higher\",\n",
    "       \"{'or': [12], 'name': 'Unspecified'}\"         :\"Unspecified education\",\n",
    "})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed4bcc23",
   "metadata": {},
   "source": [
    "# Export the aggregate"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f814b4ca",
   "metadata": {},
   "source": [
    "Starting from the individual counts of users, calculate the population proportion of different demographic categories."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "71634568",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:52.103755Z",
     "iopub.status.busy": "2024-11-21T00:32:52.103755Z",
     "iopub.status.idle": "2024-11-21T00:32:52.120240Z",
     "shell.execute_reply": "2024-11-21T00:32:52.120240Z",
     "shell.execute_reply.started": "2024-11-21T00:32:52.103755Z"
    }
   },
   "outputs": [],
   "source": [
    "final_df = pd.DataFrame(columns = [\"mau_audience\",  'country'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "9fa82763",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:52.120240Z",
     "iopub.status.busy": "2024-11-21T00:32:52.120240Z",
     "iopub.status.idle": "2024-11-21T00:32:52.151948Z",
     "shell.execute_reply": "2024-11-21T00:32:52.151948Z",
     "shell.execute_reply.started": "2024-11-21T00:32:52.120240Z"
    }
   },
   "outputs": [],
   "source": [
    "# Mexico\n",
    "for var in ['genders', 'age_range', 'education_level']:\n",
    "    agg = mex.groupby(var).sum()[['mau_audience']]\n",
    "    agg = agg/agg.sum()\n",
    "    agg['country'] = 'Mexico'\n",
    "    final_df = final_df.append(agg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "344e07c4",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:52.153749Z",
     "iopub.status.busy": "2024-11-21T00:32:52.153749Z",
     "iopub.status.idle": "2024-11-21T00:32:52.175920Z",
     "shell.execute_reply": "2024-11-21T00:32:52.175920Z",
     "shell.execute_reply.started": "2024-11-21T00:32:52.153749Z"
    }
   },
   "outputs": [],
   "source": [
    "# Kenya \n",
    "for var in ['genders', 'ages_ranges', 'scholarities']:\n",
    "    agg = ken.groupby(var).sum()[['mau_audience']]\n",
    "    agg = agg/agg.sum()\n",
    "    agg['country'] = 'Kenya'\n",
    "    final_df = final_df.append(agg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "f0ec5c5c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:52.175920Z",
     "iopub.status.busy": "2024-11-21T00:32:52.175920Z",
     "iopub.status.idle": "2024-11-21T00:32:52.203756Z",
     "shell.execute_reply": "2024-11-21T00:32:52.203756Z",
     "shell.execute_reply.started": "2024-11-21T00:32:52.175920Z"
    }
   },
   "outputs": [],
   "source": [
    "# Indonesia\n",
    "for var in ['genders', 'ages_ranges', 'scholarities']:\n",
    "    agg = ind.groupby(var).sum()[['mau_audience']]\n",
    "    agg = agg/agg.sum()\n",
    "    agg['country'] = 'Indonesia'\n",
    "    final_df = final_df.append(agg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "20eaad35",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:52.203756Z",
     "iopub.status.busy": "2024-11-21T00:32:52.203756Z",
     "iopub.status.idle": "2024-11-21T00:32:52.211930Z",
     "shell.execute_reply": "2024-11-21T00:32:52.211930Z",
     "shell.execute_reply.started": "2024-11-21T00:32:52.203756Z"
    }
   },
   "outputs": [],
   "source": [
    "final_df.index.names = [\"variable\"]\n",
    "final_df.rename(columns = {\"mau_audience\":'mean'}, inplace=True)\n",
    "final_df['source'] = \"Facebook Population\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "d73cf35f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-21T00:32:52.211930Z",
     "iopub.status.busy": "2024-11-21T00:32:52.211930Z",
     "iopub.status.idle": "2024-11-21T00:32:52.244704Z",
     "shell.execute_reply": "2024-11-21T00:32:52.244023Z",
     "shell.execute_reply.started": "2024-11-21T00:32:52.211930Z"
    }
   },
   "outputs": [],
   "source": [
    "final_df.to_csv(\"data_exports/fig2_facebook_population.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
